We want to look at real datasets to simulate realistic datasets.
filterData = F
if (filterData){
cluster = read.csv('retina_clusteridentities.txt', sep = '\t',
stringsAsFactors = F, header = F)
rownames(cluster) = cluster$V1
dropseq = read.csv('GSE63472_P14Retina_merged_digital_expression.txt',
sep = '\t', stringsAsFactors = F)
prefilter = dropseq[, colnames(dropseq) %in% cluster$V1]
selectCluster = cluster[colnames(prefilter), 'V2'] %in% c(1, 2, 7, 32, 37)
prefilter = prefilter[, selectCluster]
filter <- apply(prefilter > 10, 1, sum) >= 10
postfilter <- prefilter[filter,]
write.csv(postfilter, 'GSE63472_P14Retina_merged_digital_expression_filt.csv')
}
core = read.csv('GSE63472_P14Retina_merged_digital_expression_filt.csv',
stringsAsFactors = F)
core = as.matrix(core)
cluster = read.csv('retina_clusteridentities.txt', sep = '\t',
stringsAsFactors = F, header = F)
rownames(cluster) = cluster$V1
bio = factor(cluster[colnames(core), 'V2'])
detection_rate <- colSums(core>0)
coverage <- colSums(core)
We will color-code the cells by inferred cluster (see paper ‘Highly Parallel Genome-wide Expression Profiling of Individual Cells Using Nanoliter Droplets’). Select cells from clusters 1,2,7,32, and 37 (corresponding to different cell classes in the retina), filter out the genes that do not have at least 10 counts in at least 10 cells. Dimensions of the dataset is
dim(core)
## [1] 703 1583
Note that after filtering, the number of genes is small than the number of cells (like in Zeisel dataset).
par(mfrow = c(1,2))
pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, rowMeans(core == 0), xlim = c(0,4), ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
main = '1,000 most variable genes')
smoothScatter(mean, rowMeans(core == 0), xlim = c(0,4),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = '1,000 most variable genes')
Fit data with K = 4, V and X intercepts in both Mu and Pi, and commondispersion = FALSE.
print(system.time(zinb <- zinbFit(core, ncores = 3, K = 4,
commondispersion = FALSE)))
## user system elapsed
## 4367.869 1791.992 2296.210
If we simulate W from real data, it will look like that.
pairs(zinb@W, col = cols[bio], pch = 19, main = "ZINB")
gamma_mu = zinb@gamma_mu[1,]
gamma_pi = zinb@gamma_pi[1,]
df <- data.frame(gamma_mu = gamma_mu,
gamma_pi = gamma_pi,
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## gamma_mu gamma_pi detection_rate coverage
## gamma_mu 1.0000000 -0.3535422 0.9075246 0.9488025
## gamma_pi -0.3535422 1.0000000 -0.5263647 -0.4920994
## detection_rate 0.9075246 -0.5263647 1.0000000 0.9740914
## coverage 0.9488025 -0.4920994 0.9740914 1.0000000
gamma = data.frame(gamma_mu = gamma_mu, gamma_pi = gamma_pi, celltype = bio)
gamma = melt(gamma)
ggplot(gamma, aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 50, col = 'gray') +
geom_density(col= 'blue', size = .5) +
facet_grid(~ variable) + xlab('gamma')
ggplot(gamma, aes(x = variable, y = value)) + xlab('') + theme_bw() +
geom_boxplot() + coord_flip() + facet_grid(~ variable, scales = 'free') +
theme_bw() + ylab('gamma') + scale_x_discrete(breaks = c('', ''), drop=FALSE)
ggplot(gamma, aes(value, fill = celltype)) + geom_density(alpha = 0.2) +
facet_grid(~ variable) + xlab('gamma') + theme_bw()
beta_mu = zinb@beta_mu[1,]
beta_pi = zinb@beta_pi[1,]
df <- data.frame(beta_mu = beta_mu,
beta_pi = beta_pi)
pairs(df, pch=19)
print(cor(df, method="spearman"))
## beta_mu beta_pi
## beta_mu 1.0000000 -0.2221312
## beta_pi -0.2221312 1.0000000
beta = data.frame(beta_mu = beta_mu, beta_pi = beta_pi)
beta = melt(beta)
ggplot(beta, aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 100, col = 'gray') +
geom_density(col= 'blue', size = .5) +
facet_grid(~ variable, scales = 'free') + xlab('beta')
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
geom_boxplot() + facet_grid(~ variable, scales = 'free') + coord_flip() +
theme_bw() + ylab('beta') + scale_x_discrete(breaks = c('', ''), drop=FALSE)
# remove outliers
max = max(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
min = min(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
geom_boxplot(outlier.shape = NA) +
facet_grid(~ variable, scales = 'free') + coord_flip() +
theme_bw() + ylab('beta removing outliers') +
scale_x_discrete(breaks = c('', ''), drop=FALSE) +
scale_y_continuous(limits = c(min, max))
## Warning: Removed 838 rows containing non-finite values (stat_boxplot).
pairs(t(zinb@alpha_mu), main = 'alpha_mu')
pairs(t(zinb@alpha_pi), main = 'alpha_pi')
df <- data.frame(alpha_mu_1 = zinb@alpha_mu[1, ],
alpha_mu_2 = zinb@alpha_mu[2, ],
alpha_mu_3 = zinb@alpha_mu[3, ],
alpha_mu_4 = zinb@alpha_mu[4, ],
alpha_pi_1 = zinb@alpha_pi[1, ],
alpha_pi_2 = zinb@alpha_pi[2, ],
alpha_pi_3 = zinb@alpha_pi[3, ],
alpha_pi_4 = zinb@alpha_pi[4, ])
pairs(df, pch=19)
print(cor(df, method="spearman"))
## alpha_mu_1 alpha_mu_2 alpha_mu_3 alpha_mu_4 alpha_pi_1
## alpha_mu_1 1.00000000 -0.16924097 -0.439265936 0.096582896 -0.09114714
## alpha_mu_2 -0.16924097 1.00000000 0.283156270 -0.128433814 -0.19545680
## alpha_mu_3 -0.43926594 0.28315627 1.000000000 -0.001863545 0.10310579
## alpha_mu_4 0.09658290 -0.12843381 -0.001863545 1.000000000 0.25073828
## alpha_pi_1 -0.09114714 -0.19545680 0.103105786 0.250738282 1.00000000
## alpha_pi_2 -0.14822571 0.18279627 0.126350702 -0.261374516 -0.35861884
## alpha_pi_3 0.22894955 -0.01948612 -0.229933375 0.057926624 0.01539616
## alpha_pi_4 -0.15200914 0.21313809 0.115836116 -0.363158579 -0.37953663
## alpha_pi_2 alpha_pi_3 alpha_pi_4
## alpha_mu_1 -0.1482257 0.22894955 -0.15200914
## alpha_mu_2 0.1827963 -0.01948612 0.21313809
## alpha_mu_3 0.1263507 -0.22993337 0.11583612
## alpha_mu_4 -0.2613745 0.05792662 -0.36315858
## alpha_pi_1 -0.3586188 0.01539616 -0.37953663
## alpha_pi_2 1.0000000 -0.14502794 0.44001938
## alpha_pi_3 -0.1450279 1.00000000 -0.04517799
## alpha_pi_4 0.4400194 -0.04517799 1.00000000
par(mfrow=c(1,1))
set = newSeqExpressionSet(core)
fq = betweenLaneNormalization(set, which = "full", offset = T)
disp = estimateDisp(counts(fq), offset = -offst(fq))
## Design matrix not provided. Switch to the classic mode.
plot(disp$tagwise.dispersion, 1/exp(zinb@zeta), ylab = 'zinb dispersion',
xlab = 'edgeR tagwise dispersion', main = 'Dispersion')
print(cor(disp$tagwise.dispersion, 1/exp(zinb@zeta), method="spearman"))
## [1] 0.589623
par(mfrow = c(1, 2))
plot(density(1/exp(zinb@zeta)), main = 'zinb dispersion')
plot(density(disp$tagwise.dispersion), main = 'edgeR dispersion')
par(mfrow = c(1, 2))
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, 1/exp(zinb@zeta), main = 'zinb', xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 14))
plot(mean, disp$tagwise.dispersion, main = 'edgeR',xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 14))
par(mfrow=c(1, 2))
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
plot(rowMeans(getPi(zinb)), detection_rate, xlab="Average estimated Pi", ylab="Detection Rate for each cell", pch=19, col=cols[bio], ylim = c(0, 1))
plot(rowMeans(log1p(getMu(zinb))), coverage, xlab="Average estimated log Mu", ylab="Coverage", pch=19, col=cols[bio])
par(mfrow=c(1, 3))
smoothScatter(mean, rowMeans(core == 0),xlim = c(0,4),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = 'True')
smoothScatter(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)), nbin = 256,
nrpoints=Inf, pch="", cex=.7, xlim = c(0,4),
xlab = "Estimated Mean Expression", main = 'Estimated',
ylab = "Estimated Dropout Rate",ylim = c(0,1),
colramp = pal)
plot(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)),
xlab = "Estimated Mean Expression", xlim = c(0,4),
ylab = "Estimated Dropout Rate", pch=19,
ylim = c(0, 1), main = 'Estimated')